/*	This program creates a working dataset for the Firm Sample centered
at the month of the first worker's 50th birthday.*/


***** Set directories 
local dir_raw 		"~/Dropbox/Retirement gaming/raw/muestra empresas"
local dir_clean 	"~/Dropbox/Retirement gaming/clean"
local dir_do	 	"~/Dropbox/Retirement gaming/dataverse"

local dataname "data_firmsample.dta" 
local dataname2 "firmsample_age50.dta"


********************************
** Import raw firm-level data **
********************************

clear
import delimited "`dir_raw'/cabezales_C.txt", delimiter("|") 

rename v1 aportaci
rename v2 j
rename v3 j2
rename v4 obra
rename v5 Mescargo
rename v6 Tipocontr
rename v7 soc
rename v8 dep
rename v9 pat
rename v10 fechainicio
rename v11 ciiu
rename v12 Dpto
rename v13 exonerac
rename v14 numlocales

gen Dpto2=.
replace Dpto2=	1	 if Dpto=="MONTEVIDEO"
replace Dpto2=	2	 if Dpto=="ARTIGAS"
replace Dpto2=	3	 if Dpto=="CANELONES"
replace Dpto2=	4	 if Dpto=="CERRO LARGO"
replace Dpto2=	5	 if Dpto=="COLONIA"
replace Dpto2=	6	 if Dpto=="DURAZNO"
replace Dpto2=	7	 if Dpto=="FLORES"
replace Dpto2=	8	 if Dpto=="FLORIDA"
replace Dpto2=	9	 if Dpto=="LAVALLEJA"
replace Dpto2=	10	 if Dpto=="MALDONADO"
replace Dpto2=	11	 if Dpto=="PAYSANDU"
replace Dpto2=	12	 if Dpto=="RIO NEGRO"
replace Dpto2=	13	 if Dpto=="RIVERA"
replace Dpto2=	14	 if Dpto=="ROCHA"
replace Dpto2=	15	 if Dpto=="SALTO"
replace Dpto2=	16	 if Dpto=="SAN JOSE"
replace Dpto2=	17	 if Dpto=="SORIANO"
replace Dpto2=	18	 if Dpto=="TACUAREMBO"
replace Dpto2=	19	 if Dpto=="TREINTA Y TRES"

drop Dpto
rename Dpto2 Dpto

replace j="" if j=="DESEMPLEO"
replace j2="" if j2=="DESEMPLEO"
replace j2="" if j2=="MATERNIDAD"
replace j="" if j=="MATERNIDAD"
replace j="" if j=="ENFERMEDAD"
replace j2="" if j2=="ENFERMEDAD"

destring j, replace
destring j2, replace

tostring Mescargo, replace
g t=date(Mescargo, "YM")
replace t=mofd(t)
format t %tm
destring Mescargo, replace

egen dia=ends(fechainicio) , punct(-) head
egen ano=ends(fechainicio) , punct(-) tail
egen anio=ends(ano) , punct(-) tail
egen me=ends(ano) , punct(-) head
drop ano
gen mes=.
tostring mes, replace
replace mes="01" if me=="ENE"
replace mes="02" if me=="FEB"
replace mes="03" if me=="MAR"
replace mes="04" if me=="ABR"
replace mes="05" if me=="MAY"
replace mes="06" if me=="JUN"
replace mes="07" if me=="JUL"
replace mes="08" if me=="AGO"
replace mes="09" if me=="SEP"
replace mes="10" if me=="OCT"
replace mes="11" if me=="NOV"
replace mes="12" if me=="DIC"

gen ii="/"
gen iii="/"

egen tmin=min(t), by(j)

replace anio="2000" if anio =="00" & tmin>=472
replace anio="1900" if anio =="00" & tmin<=468
replace anio="2001" if anio =="01" & tmin>=481
replace anio="1901" if anio =="01" & tmin<=480
replace anio="2002" if anio =="02" & tmin>=504
replace anio="1902" if anio =="02" & tmin<=492
replace anio="2003" if anio =="03" & tmin>=516
replace anio="1903" if anio =="03" & tmin<=504
replace anio="2004" if anio =="04" & tmin>=528
replace anio="1904" if anio =="04" & tmin<=516
replace anio="2005" if anio =="05" & tmin>=540
replace anio="1905" if anio =="05" & tmin<=528
replace anio="2006" if anio =="06" & tmin>=552
replace anio="1906" if anio =="06" & tmin<=540
replace anio="2007" if anio =="07" & tmin>=564
replace anio="1907" if anio =="07" & tmin<=552
replace anio="2008" if anio =="08" & tmin>=576
replace anio="1908" if anio =="08" & tmin<=564
replace anio="2009" if anio =="09" & tmin>=588
replace anio="1909" if anio =="09" & tmin<=576
replace anio="2010" if anio =="10" & tmin>=600
replace anio="1910" if anio =="10" & tmin<600
replace anio="2011" if anio =="11" & tmin>=612
replace anio="1911" if anio =="11" & tmin<612
replace anio="2012" if anio =="12" & tmin>=624
replace anio="1912" if anio =="12" & tmin<624
replace anio="2013" if anio =="13" & tmin>=636
replace anio="1913" if anio =="13" & tmin<=624
replace anio="2014" if anio =="14" & tmin>=648
replace anio="1914" if anio =="14" & tmin<=636
replace anio="2015" if anio =="15" & tmin>=660
replace anio="1915" if anio =="15" & tmin<=648
replace anio="1916" if anio =="16"
replace anio="1917" if anio =="17"
replace anio="1918" if anio =="18"
replace anio="1919" if anio =="19"
replace anio="1920" if anio =="20"
replace anio="1921" if anio =="21"
replace anio="1922" if anio =="22"
replace anio="1923" if anio =="23"
replace anio="1924" if anio =="24"
replace anio="1925" if anio =="25"
replace anio="1926" if anio =="26"
replace anio="1927" if anio =="27"
replace anio="1928" if anio =="28"
replace anio="1929" if anio =="29"
replace anio="1930" if anio =="30"
replace anio="1931" if anio =="31"
replace anio="1932" if anio =="32"
replace anio="1933" if anio =="33"
replace anio="1934" if anio =="34"
replace anio="1935" if anio =="35"
replace anio="1936" if anio =="36"
replace anio="1937" if anio =="37"
replace anio="1938" if anio =="38"
replace anio="1939" if anio =="39"
replace anio="1940" if anio =="40"
replace anio="1941" if anio =="41"
replace anio="1942" if anio =="42"
replace anio="1943" if anio =="43"
replace anio="1944" if anio =="44"
replace anio="1945" if anio =="45"
replace anio="1946" if anio =="46"
replace anio="1947" if anio =="47"
replace anio="1948" if anio =="48"
replace anio="1949" if anio =="49"
replace anio="1950" if anio =="50"
replace anio="1951" if anio =="51"
replace anio="1952" if anio =="52"
replace anio="1953" if anio =="53"
replace anio="1954" if anio =="54"
replace anio="1955" if anio =="55"
replace anio="1956" if anio =="56"
replace anio="1957" if anio =="57"
replace anio="1958" if anio =="58"
replace anio="1959" if anio =="59"
replace anio="1960" if anio =="60"
replace anio="1961" if anio =="61"
replace anio="1962" if anio =="62"
replace anio="1963" if anio =="63"
replace anio="1964" if anio =="64"
replace anio="1965" if anio =="65"
replace anio="1966" if anio =="66"
replace anio="1967" if anio =="67"
replace anio="1968" if anio =="68"
replace anio="1969" if anio =="69"
replace anio="1970" if anio =="70"
replace anio="1971" if anio =="71"
replace anio="1972" if anio =="72"
replace anio="1973" if anio =="73"
replace anio="1974" if anio =="74"
replace anio="1975" if anio =="75"
replace anio="1976" if anio =="76"
replace anio="1977" if anio =="77"
replace anio="1978" if anio =="78"
replace anio="1979" if anio =="79"
replace anio="1980" if anio =="80"
replace anio="1981" if anio =="81"
replace anio="1982" if anio =="82"
replace anio="1983" if anio =="83"
replace anio="1984" if anio =="84"
replace anio="1985" if anio =="85"
replace anio="1986" if anio =="86"
replace anio="1987" if anio =="87"
replace anio="1988" if anio =="88"
replace anio="1989" if anio =="89"
replace anio="1990" if anio =="90"
replace anio="1991" if anio =="91"
replace anio="1992" if anio =="92"
replace anio="1993" if anio =="93"
replace anio="1994" if anio =="94"
replace anio="1995" if anio =="95"
replace anio="1996" if anio =="96"
replace anio="1997" if anio =="97"
replace anio="1998" if anio =="98"
replace anio="1999" if anio =="99"


egen fecha=concat(dia ii mes iii anio)

gen fechaIN=.
replace fechaIN= date(fecha, "DMY")
format fechaIN %td
drop dia mes anio ii iii me fecha fechainicio


compress
drop tmin
compress
egen ci=mode(ciiu), by (j Mescargo) maxmode
egen ap=mode(aportaci), by (j Mescargo) maxmode
egen ti=mode(Tipocontr), by (j Mescargo) maxmode
egen de=sum(dep), by (j Mescargo)
collapse (mean) ci ap ti de, by (j Mescargo)
rename ci ciiu
rename ap aportaci
rename ti Tipocontr
rename de ndep


** Keep small firms  **

* Keep firms which are ever observed with <10 workers
bysort j: egen minndepj=min(ndep)
keep if minndepj<10 
* Drop firms that are ever above 50 workers
bysort j: egen maxndepj=max(ndep)
drop if maxndepj>=50

save "`dir_clean'/firmdata.dta", replace

exit
**********************************
** Import raw worker-level data **
**********************************

forvalues num=1996/2013 {
clear
import delimited "`dir_raw'/lineas_C_`num'.txt", delimiter("|") varnames(nonames)

rename v1 aportaci 
rename v2 j 
rename v3 j2
rename v4 obra
rename v5 tipodoc
rename v6 pais 
rename v7 i 
rename v8 sexo 
rename v9 fNAC 
rename v10 acum
rename v11 Mescargo
rename v12 fechaingr 
rename v13 fechaegr 
rename v14 causal 
rename v15 computo  
rename v16 exone 
rename v17 diasTRA 
rename v18 horasTRA 
rename v19 horasSEM 
rename v20 SegS 
rename v21 vf 
rename v22 tipREM 
rename v23 categ 
rename v24 remC1
rename v25 remC2
rename v26 remC3

gen doc=.
replace doc=1 if tipodoc=="DO"
replace doc=0 if tipodoc!="DO"
drop tipodoc

gen SD=0
replace SD=1 if j=="DESEMPLEO"
replace j="" if j=="DESEMPLEO"
replace j2="" if j2=="DESEMPLEO"

gen mat=0
replace mat=1 if j=="MATERNIDAD"
replace j2="" if j2=="MATERNIDAD"
replace j="" if j=="MATERNIDAD"

gen enf=0
replace enf=1 if j=="ENFERMEDAD"
replace j="" if j=="ENFERMEDAD"
replace j2="" if j2=="MATERNIDAD"

destring j, replace
destring j2, replace

gen Fnac=.
replace Fnac= date(fNAC, "DMY")
format Fnac %td
drop fNAC

gen Fing=.
replace Fing= date(fechaingr, "DMY")
format Fing %td
drop fechaingr
 
gen Fegr=.
replace Fegr= date(fechaegr, "DMY")
format Fegr %td
drop fechaegr

replace sexo=0 if sexo==2

compress

tempfile workerdata`num'
save `workerdata`num'', replace

}

forvalues num=1996/2012 {
append using `workerdata`num''
}

save "`dir_clean'/workerdata.dta", replace


**********************************************************************
* WORKER DATASET MERGED TO SMALL FIRM SAMPLE
**********************************************************************

use "`dir_clean'/workerdata.dta", clear
cap drop _merge
merge m:1 j Mescargo using "`dir_clean'/firmdata.dta"
keep if _merge==3 
drop _merge
compress


* Firm size categorical
label define size_cat 0 "Micro less than 5 " 1 "Micro 5-9" 2 "Small 10-19" 3 "Small 20-49" 4 "Medium 50-249" 5 "Large 250 plus"
foreach X in ndep {
	g `X'_cat=0 if `X' <5 // micro 1 less than 5
	replace `X'_cat=1 if `X'>=5 & `X'<10 // micro 2
	replace `X'_cat=2 if `X'>=10 & `X'<20 // small 1
	replace `X'_cat=3 if `X'>=20 & `X'<50 // small 2
	replace `X'_cat=4 if `X'>=50 & `X'<250 // medium
	replace `X'_cat=5 if `X'>=250  // large
	label values `X'_cat size_cat
}

* dates and age
gen dia="01"
tostring  Mescargo, replace
egen fecha=concat(Mescargo dia)
drop dia
g temp=date(fecha,"YMD")
drop fecha 
destring Mescargo, replace 
gen age=(temp-Fnac)/365
g t=mofd(temp)
format t %tm
g year=year(temp)
g mes=month(temp)

run "`dir_do'/ipc.do"

* real earnings 1000s pesos Dec 2015
egen aw=sum(remC1), by(i j t)
gen W=(aw/1000)/ipc
replace W=. if W==0

* Drop duplicates

egen maxw=max(remC1), by(i j t)
duplicates tag i j t, gen(dup)
egen mina=min(aportaci), by (i j t)
drop if mina==1 & aportaci!=1 & dup!=0
drop mina
drop if remC1<maxw & remC1!=. & maxw!=. & dup!=0
drop maxw
generate aleatorio = (1-(-1))*runiform() + (-1)
egen maxa=max(aleatorio), by (i j t)
drop if aleatorio<maxa & dup!=0
drop maxa dup 
drop aleatorio

duplicates r i j t


* tenure
gen aux=(temp-Fing)/365
egen tminIJ=min(t), by(j i)
gen tenure=.
replace  tenure=aux if tminIJ==t
sort i j t
replace  tenure=tenure[_n-1]+0.0833333 if i==i[_n-1] & j==j[_n-1] &  tenure[_n-1]!=.
drop aux tminIJ
sort i t j 

save "`dir_clean'/`dataname'", replace


*******************************
** Clean data **
*******************************

use "`dir_clean'/`dataname'", clear

** Sample restrictions **

drop if aportaci!=1 

* Keep only dependent workers
keep if vf==12

* Drop if no earnings reported
drop if remC1==.
drop if remC1==0

* Drop earnings outliers (using main sample thresholds)
replace W=. if W<3.604606 
replace W=. if W>81.31013 
drop if W==.

* salaried work observations 
g WS=W if tipREM==1
bysort j t: egen maxtipREMjt=max(tipREM) 
g sampleWSjt=maxtipREMjt==1


* WORKER AGE CENTERED AT 50
*birthdate
format Fnac %td
g birth_month=mofd(Fnac)
format birth_month %tm
g refbday_month=birth_month+(12*50)
format refbday_month %tm
* age centered at 50's birthday
g agemonths_centered = t-refbday_month 
* age
g agemonths=agemonths_centered+12*50
cap drop age
g age=agemonths/12

* Industry codes
cap drop aux
tostring ciiu, g(aux)
g ciiu2=substr(aux,1,2)
destring ciiu2, replace
drop aux

gen 	ciiu1 = 1 if ciiu2<10											// Agriculture and mining	
replace ciiu1 = 2 if (ciiu2>=10 & ciiu2<=33) | ciiu2==95 				// Manufacturing 
replace ciiu1 = 3 if ciiu2>=35 & ciiu2<=39 								// Energy and waste disposal
replace ciiu1 = 4 if ciiu2>=41 & ciiu2<=43  							// Construction
replace ciiu1 = 5 if (ciiu2>=45 & ciiu2<=47) | (ciiu2>=55 & ciiu2<=56)	// wholesale and retail, restaurants, hotels
replace ciiu1 = 6 if (ciiu2>=49 & ciiu2<=53) | ciiu2==61				// transport and communications 
replace ciiu1 = 7 if (ciiu2>=62 & ciiu2<=82) | ciiu2==96				// services
replace ciiu1 = 8 if (ciiu2>=84 & ciiu2<=94) | (ciiu2>=58 & ciiu2<=60)| ciiu2==97 // public admin, social and domestic services

g manufacturing		= ciiu1==2
g retailhospitality	= ciiu1==5 
g transportenergy	= ciiu1==6 | ciiu1==3
g services 			= ciiu1==7 | ciiu1==8 | ciiu1==4 
foreach var in manufacturing retailhospitality transportenergy services {
	replace `var'=. if ciiu2==.
}
label var manufacturing 	"Manufacturing"
label var retailhospitality "Retail, Restaurants, Hotels"
label var transportenergy 	"Transport, Communications, Energy"
label var services 			"Services, Other"

g sector=1 if manufacturing==1
replace sector=2 if retailhospitality==1
replace sector=3 if transportenergy==1
replace sector=4 if services==1

* FIRM SIZE CATEGORIES 
replace ndep_cat=ndep_cat+1
replace ndep_cat=0 if ndep==0
label define ndep_cat 0 "no employees" 1 "1-4" 2 "5-9" 3 "10-19" 4 "20-49" 5 "50-249" 6"250 plus" 
label values ndep_cat ndep_cat

* Gender
rename sexo male
g any=1

* Share male workers
bysort j t: egen sharemale_jt=mean(male)

* Alternative tenure measure (time observed) (to use when tenure is missing)
egen pairij=group(i j)
xtset pairij t
bysort pairij: g timeij=_n
cap drop altten
g altten=(timeij-1)/12
sum tenure altten
sum altten if tenure<0
replace tenure=altten if tenure<0
sum altten if tenure==., det
replace tenure=altten if tenure==.


* Create variables

// Individual worker dummies for 50+
g i50plus_any=agemonths_centered>=0 & any==1 
g i50tenured_any=agemonths_centered>=0 & any==1 & tenure>=1 	
// identify individual observations:  time relative to ref worker turning 50 
g refwrk_rft= agemonths_centered==0 & tenure>=1 // indicator for observation when tenured and age rft (50),  but there could be more than one in close time
bys j: egen ztreateverj=max(refwrk_rft) // indicates firm is treated by instrument (all observations for the firm of refworker)	
bys j: egen nrrefwrk=sum(refwrk_rft)
tab nrrefwrk if refwrk_rft==1
//first case
cap drop aux
bys j: egen aux=min(t) if refwrk_rft==1 // this tells us the first date when a refworker is observed in the firm
g refwrk_rft1= t==aux // this corrects maxtenagejt_any so that only first one observed at that age in the firm is counted 
bys j: egen reft1=min(aux)
format reft1 %tm
//second case
cap drop aux
bys j: egen aux=min(t) if refwrk_rft==1 & refwrk_rft1!=1 // this tells us the second date when a refworker is observed in the firm
g refwrk_rft2= t==aux
bys j: egen reft2=min(aux)
format reft2 %tm
cap drop aux
g diff12=reft2-reft1
//third case
cap drop aux
bys j: egen aux=min(t) if refwrk_rft==1 & refwrk_rft1!=1 & refwrk_rft2!=1 // this tells us the second date when a refworker is observed in the firm
g refwrk_rft3= t==aux
bys j: egen reft3=min(aux)
format reft3 %tm
cap drop aux
g diff13=reft3-reft1
// order individual observations within firm
g refwrk_nr=1 if refwrk_rft1==1
replace refwrk_nr=2 if refwrk_rft2==1 & diff12>(11*12)
// identify firm-worker pairs treated, and dates when pair is still matched
forvalues X=1/2 {
	cap drop aux`X' 		
	g aux`X'=i if refwrk_nr==`X'
	bys j: egen refwrk`X'_id=min(aux`X')
	g refwrk`X'_match=i==refwrk`X'_id
	replace refwrk`X'_match=0 if refwrk`X'_match==.
}
// Reference worker's earnings
forvalues X=1/2 {
	cap drop aux`X' 		
	g refwrk`X'_W=W if refwrk`X'_match==1
	replace refwrk`X'_W=0 if refwrk`X'_match!=1
}
*replace refwrk_nr=2 if refwrk_rft3==1 & diff12<=(10*12) & diff13>(10*12) // 10 years enough for 6 years after and 3 before
// time of firm observations, relative to refworker turning 50
forvalues i=1/2 {
	cap drop aux
	g aux=t if refwrk_nr==`i'
	bys j: egen reftime`i'=min(aux)
	drop aux
	g ztimec`i'=t-reftime`i'
}	

**********************************
* COLLAPSE DATASET BY FIRM-MONTH 
**********************************

collapse (sum) i50* W WS refwrk1_W refwrk2_W ///
		 (count) i  ///
		 (mean) tenure  ///
		 (max) sharemale_jt ndep sampleWSjt ztreateverj refwrk1_match refwrk2_match ///
		 (last) ciiu1 ciiu2 sector year ///
		 (min) ztimec1 ztimec2  ///
		, by(j t)
		
save "`dir_clean'/`dataname2'", replace


**********************************
* Prepare data for analysis 
**********************************

use "`dir_clean'/`dataname2'", clear

xtset j t

** SAMPLE SELECTION AND TREATMENT INSTRUMENTS **

keep if ztreateverj==1

//Drop firms without salaried workers
drop if WS==0

//Generate time relative to event
sum ztimec1 ztimec2
forvalues i=1/2 {
	g zyearsc`i' = floor(ztimec`i'/12)
}
sum zyearsc1 zyearsc2 

// select sample periods
keep if (zyearsc1>=-5 & zyearsc1<=7) | (zyearsc2>=-5 & zyearsc2<=7) 

// Generate time and years relative to each case event
g caseid=1 if (zyearsc1>=-5 & zyearsc1<=7)
replace caseid=2 if caseid==. & (zyearsc2>=-5 & zyearsc2<=7)
g ztimejt=ztimec1 if caseid==1
g zyearsjt=zyearsc1 if caseid==1
replace ztimejt=ztimec2 if caseid==2
replace zyearsjt=zyearsc2 if caseid==2

// Generate time and years relative to each case event -- version with firm-worker matches
g match=(case==1 & refwrk1_match==1)
replace match=1 if case==2 &  refwrk2_match==1
g ztimematchjt=ztimec1 if caseid==1 & refwrk1_match==1
g zyearsmatchjt=zyearsc1 if caseid==1 & refwrk1_match==1
replace ztimematchjt=ztimec2 if caseid==2 & refwrk2_match==1
replace zyearsmatchjt=zyearsc2 if caseid==2 & refwrk2_match==1

// select cases with sufficient months in sample
egen newid=group(j case)
bys newid: egen lifemths=count(t)
sum lifemths
g lifeyrs=lifemths/12
bys case: sum lifeyrs
keep if lifemths>=8*12

//select small firms
bys newid zyearsjt: egen meanyri=mean(i) 
g aux=meanyri if zyearsjt==-1
bys newid: egen meanipreyr=mean(aux)
bys newid: egen avsize=max(meanyri)
keep if meanipreyr<=9 
sum avsize, det
keep if avsize<=9 

*Firm size
g fsize=1 if  meanipreyr<=5
replace fsize=2 if meanipreyr>5 & meanipreyr<=9
replace fsize=3 if meanipreyr>=10

// Earnings of reference worker
g Wr=refwrk1_W if case==1 & match==1
replace Wr=refwrk2_W if case==2 & match==1
sum Wr

** OUTCOMES
g Wp=W/i
g WSp=WS/i
global i50plus_any "Any workers 50+" 
global i "Total workers"
global W "Total wage bill"
global Wp "Earnings per worker"
global WS "Total salaried wage bill"
global Wr "Earnings reference worker"

** DUMMIES AND TRENDS FOR REGRESSIONS
* 50-53 dummy
g zpost5053=zyearsjt>=0 & zyearsjt<4
label var zpost5053 "Ref. Worker Age 50-53"

* Post-50 and 54 dummies
g zpost54=zyearsjt>=4
label var zpost54 "Ref. Worker Age$\geq$54"

* Pre-48 
g zpre48=zyearsjt<=-2
label var zpre48 "Ref. Worker Age$\leq$48"

* Post 50
g zpost50=zyearsjt>=0 
label var zpost50 "Ref. Worker Age$\geq$50"

cap drop zage
g zage=ztimejt/12
gen zage_post50 = zage*zpost50
label var zage "Ref. Worker age trend"
label var zage_post50 "Ref. Worker age$\geq$50 x Age trend"

** SAVE DATASET **
save "`dir_clean'/`dataname2'", replace


 
clear all

exit

 
